library(knitr)
library(fpp3)
library(readxl)
library(httr)
library(glue)
#install.packages("corrr")
library(corrr)
library(slider)
library(forcats)
#devtools::install_github('smin95/smplot2', force = TRUE)
library(smplot2)
#install.packages('sf')
library(sf)
#install.packages('Rcpp')
library(Rcpp)
#install.packages('mapview')
library(mapview)
Import Data & Create tsibble
df<-readr::read_csv('df_bike_violations_2023-06-29.csv')
Rows: 122620 Columns: 14── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (5): VIOLATION_CODE, VEH_CATEGORY, CITY_NM, Location, DESCRIPTION
dbl (8): EVNT_KEY, RPT_OWNING_CMD, X_COORD_CD, Y_COORD_CD, Latitude, Longitude, daily_total_cyclists, weekly_total_cyclists
dttm (1): VIOLATION_DATE
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df<-df %>%
mutate(VIOLATION_CODE = as.factor(VIOLATION_CODE)) %>%
mutate(VEH_CATEGORY = as.factor(VEH_CATEGORY)) %>%
mutate(CITY_NM = as.factor(CITY_NM)) %>%
mutate(RPT_OWNING_CMD = as.factor(RPT_OWNING_CMD)) %>%
mutate(VIOLATION_DATE = as.POSIXct(VIOLATION_DATE))
# create a daily data tsibble
ts_daily_total <- df %>%
mutate(VIOLATION_DATE = as_date(VIOLATION_DATE)) %>%
group_by(VIOLATION_DATE, daily_total_cyclists) %>%
summarize(daily_total_violations = sum(n())) %>%
ungroup() %>%
as_tsibble(index = VIOLATION_DATE)
`summarise()` has grouped output by 'VIOLATION_DATE'. You can override using the `.groups` argument.
Clean daily tsibble
# check for missing dates # ans: 24
ts_daily_total %>%
#filter(is.na(daily_total_cyclists)) %>%
count_gaps()
# are the days missing from df_counts? # ans: no.
#df_counts %>%
# filter(is.na(daily_total))
# fill gaps dates and re-join with df_counts data, so that gapped dates now have total cyclsits. Replace violation na's with 0s.
ts_daily_total<-ts_daily_total %>%
fill_gaps() %>%
select(-daily_total_cyclists) %>%
mutate(date = VIOLATION_DATE) %>%
left_join(y=df_counts, by='date', multiple='any') %>%
rename(daily_total_cyclists = daily_total) %>%
select(-c('date', 'counts', 'weekly_total')) %>%
mutate_at('daily_total_violations', ~replace_na(., 0))
# double check for na's:
ts_daily_total %>%
filter(is.na(daily_total_cyclists))
# double check for date gaps:
ts_daily_total %>%
count_gaps()
NA
Create monthly data tsibble
# create a monthly data tsibble from cleaned daily tsibble
ts_monthly_total<-ts_daily_total %>%
index_by(yearmonth(VIOLATION_DATE)) %>%
summarize(across(c('daily_total_violations', 'daily_total_cyclists'), ~sum(.x, na.rm = TRUE))) %>%
rename(monthly_total_cyclists = daily_total_cyclists) %>%
rename(monthly_total_violations = daily_total_violations) %>%
rename(VIOLATION_DATE='yearmonth(VIOLATION_DATE)')
EDA
EDA: General
What’s the date range for the data?
date(range(df$VIOLATION_DATE))
[1] "2018-01-01" "2023-03-31"
Which dates had the highest number of riders, and how many?
df %>%
mutate(VIOLATION_DATE = date(VIOLATION_DATE)) %>%
arrange(desc(daily_total_cyclists)) %>%
distinct(VIOLATION_DATE, daily_total_cyclists) %>%
slice(1:10)
# takeaway: no particular trends, but mostly late summer / early autumn. 4 from September 2022.
Which dates had the highest number of violations, and how many
violations?
df %>%
mutate(VIOLATION_DATE = date(VIOLATION_DATE)) %>%
group_by(VIOLATION_DATE) %>%
summarize(daily_total_violations = sum(n())) %>%
arrange(desc(daily_total_violations)) %>%
slice(1:10)
# mostly warmer days in 2018 and 2019, pre-pandemic
EDA: Violation Codes
How many different violations were handed out to cyclists?
df %>%
distinct(VIOLATION_CODE)
# ans: 184
How many total violations were handed out to cyclists?
length(df$VIOLATION_CODE)
[1] 122620
# there were zero nas.
#sum(is.na(df$VIOLATION_CODE))
# ans: 122620
What were the most popular violations?
df %>%
group_by(VIOLATION_CODE, DESCRIPTION) %>%
summarize(sum = sum(n())) %>%
arrange(desc(sum)) %>%
ungroup() %>%
mutate(percent = 100*round(sum/sum(sum), 3)) %>%
slice(1:10)
`summarise()` has grouped output by 'VIOLATION_CODE'. You can override using the `.groups` argument.
# takewaways: 44% were for failing to stop at red light.
# note: 12332:ATTACHING SELF TO MOVING MOTOR VEHICLE a/k/a, the Marty McFly rule
EDA: daily cyclists
Plot: daily total cyclists over time
ts_daily_total %>%
autoplot(daily_total_cyclists)+
stat_smooth(method = "lm")

# takeaway: ridership did not change much pre or post pandemic. not even during lockdown, although possible hickup.
# However, ridership seems to be increaseing in the colder months while remaining level in the warmer months.
Plot: daily total cyclists histogram
ts_daily_total %>%
ggplot(aes(x=daily_total_cyclists))+
geom_histogram(bins=80)

Does ridership increase or decrease yearly?
# leave out 2023 since we only have up to March
ts_daily_total %>%
index_by(year(VIOLATION_DATE)) %>%
summarize(across(c('daily_total_violations', 'daily_total_cyclists'), ~sum(.x, na.rm = TRUE))) %>%
rename(yearly_total_cyclists = daily_total_cyclists) %>%
rename(yearly_total_violations = daily_total_violations) %>%
rename(VIOLATION_DATE='year(VIOLATION_DATE)') %>%
filter_index(.~'2022') %>%
autoplot(yearly_total_cyclists)

# Takeaway: Yes, it increases. slight dip in 2019 but clearly increasing
Does ridership change throughout the year (seasonality)?
# yearly seasonal plot
ts_daily_total %>%
gg_season(daily_total_cyclists)

# takeaway - it's roughly the same yearly cycle per year. increase in ridership from May-Oct, decrease Nov-Apr
# same as above but monthly totals, looks cleaner:
ts_monthly_total %>%
gg_season(monthly_total_cyclists)

# takeaway - dip in April 2020 during covid lockdown, but not much less than before.
# looking at it by month:
ts_monthly_total %>%
gg_subseries(monthly_total_cyclists)

# takeaways:
## seasonality is a little clearer. monthly difference is clearer.
## 2023 is turning out to be noticeably more riders than previous years.
# Ridership changes over time takeways:
## Ridership is increasing.
## Clear yearly seasonality, with May-Oct being the busiest, and Nov-Apr being less busy.
## Not much visual difference pre- and post- lockdown/pandemic.
Daily total cyclists decomposition
# classical decomp doesnt handle multi seasonality too well
ts_daily_total %>%
# filter(!is.na(daily_total_cyclists)) %>%
# fill_gaps() %>%
model(classical_decomposition(daily_total_cyclists, type = "additive")) %>%
components() %>%
autoplot()

# STL - best so far
ts_daily_total %>%
model(
STL(daily_total_cyclists ~ trend(window = 365) +
season(period='year')+season(period='1 month'),
)) %>%
components() %>%
autoplot()

# what about moving average for trend? it ok. I think stl is best.
ts_daily_total %>%
mutate(`5-MA` = slider::slide_dbl(daily_total_cyclists, mean, .before = 90, .after = 90, .complete = TRUE)) %>%
autoplot(daily_total_cyclists)+
geom_line(aes(y = `5-MA`), colour = "#D55E00")

# what about just a regular regression line
ts_daily_total %>%
autoplot(daily_total_cyclists)+
stat_smooth(method = "lm")

NA
NA
EDA: daily violations
Plot: daily total violations over time
ts_daily_total %>%
autoplot(daily_total_violations)

# takeaway: There were more violations before the pandemic. During lockdown, very few.
Plot: daily total violations histogram
ts_daily_total %>%
ggplot(aes(x=daily_total_violations))+
geom_histogram(bins=80)

Are the yearly number of violations trending up or down?
ts_daily_total %>%
#filter_index('2020'~.) %>%
index_by(year(VIOLATION_DATE)) %>%
summarize(across(c('daily_total_violations', 'daily_total_cyclists'), ~sum(.x, na.rm = TRUE))) %>%
rename(yearly_total_cyclists = daily_total_cyclists) %>%
rename(yearly_total_violations = daily_total_violations) %>%
rename(VIOLATION_DATE='year(VIOLATION_DATE)') %>%
filter_index(.~'2022') %>%
autoplot(yearly_total_violations)

# takeaway: not reall, post-covid. It looks steady. 2020, even with lockdown, had more violations than 2021 and 2022, but less than 2019.
# monthly: just cleaner graph of daily violations over time. Post-covid looks steady.
ts_monthly_total %>%
autoplot(monthly_total_violations)

Do number of violations change throughout the year (seasonal)?
# just post-covid
ts_daily_total %>%
filter_index('2020-03'~.) %>%
gg_season(daily_total_violations)

# is it easier to see monthly?
ts_monthly_total %>%
filter_index('2020-03'~.) %>%
gg_season(monthly_total_violations)

# subseries plot
ts_monthly_total %>%
filter_index('2020-03'~.) %>%
gg_subseries(monthly_total_violations)

# takeaway - Hard to see any trends, but fewer violations in Nov/Dec/Jan. Most in Sept Oct. 2021 was a steady year.
As the number of riders increases, does the number of violations
also increase?
# utilizes corrr library
df %>%
mutate(VIOLATION_DATE = date(VIOLATION_DATE)) %>%
group_by(VIOLATION_DATE) %>%
mutate(daily_total_violations = sum(n())) %>%
select(daily_total_cyclists, daily_total_violations) %>%
correlate()
Adding missing grouping variables: `VIOLATION_DATE`Non-numeric variables removed from input: `VIOLATION_DATE`Correlation computed with
• Method: 'pearson'
• Missing treated using: 'pairwise.complete.obs'
# takeaawy cor: 0.2159162 So no, as the number of riders increases, the number of violations does not.
Plot: daily violations vs. daily cyclists
ts_daily_total %>%
ggplot(aes(x=daily_total_violations, y=daily_total_cyclists))+
geom_point()

Does this differ for pre-covid and post-lockdown?
Plot: Pre-covid

Plot: Post-covid
# correlation: 0.3846224
ts_daily_total %>%
filter_index("2020-04"~.) %>%
select(daily_total_cyclists, daily_total_violations) %>%
correlate()
Non-numeric variables removed from input: `VIOLATION_DATE`Correlation computed with
• Method: 'pearson'
• Missing treated using: 'pairwise.complete.obs'
Do largest violations (summed per day) change over time?
df %>%
mutate(fct_lump = fct_lump(f=df$VIOLATION_CODE, n=9)) %>%
filter(fct_lump=='1111D1C') %>%
mutate(date = date(VIOLATION_DATE)) %>%
filter(date>='2020-04-01') %>%
group_by(date, fct_lump) %>%
summarize(sum=sum(n())) %>%
ggplot(aes(x=date, y=sum))+
geom_line()
`summarise()` has grouped output by 'date'. You can override using the `.groups` argument.

To Do: Did the type of violations change after covid?
To do: Geocode violations. See where most of them occur.
To do: Can we predict ridership?
To do: Can we predict daily violations?
scratchpad
# cyclist counters per borough:
#library(forcats)
df_bicycle_counters_boroughs %>%
ggplot(aes(x=fct_infreq(Borough))) +
geom_bar()+
labs(x = "Borough", y="Cyclist Counters")
Error in `geom_bar()`:
! Problem while computing aesthetics.
ℹ Error occurred in the 1st layer.
Caused by error in `check_factor()`:
! object 'Borough' not found
Backtrace:
1. base (local) `<fn>`(x)
18. forcats::fct_infreq(Borough)
19. forcats:::check_factor(f)

---
title: "R Notebook"
output: html_notebook
---


```{r message=FALSE}
library(knitr)
library(fpp3)
library(readxl)
library(httr)
library(glue)
#install.packages("corrr")
library(corrr)
library(slider)
library(forcats)
#devtools::install_github('smin95/smplot2', force = TRUE)
library(smplot2)
#install.packages('sf')
library(sf)
#install.packages('Rcpp')
library(Rcpp)
#install.packages('mapview')
library(mapview)
```


## Import Data & Create tsibble
```{r}
# Load the violation-level data (one row per violation), convert the
# categorical columns to factors and make sure VIOLATION_DATE is POSIXct.
df <- readr::read_csv("df_bike_violations_2023-06-29.csv") %>%
  mutate(
    across(
      c(VIOLATION_CODE, VEH_CATEGORY, CITY_NM, RPT_OWNING_CMD),
      as.factor
    ),
    VIOLATION_DATE = as.POSIXct(VIOLATION_DATE)
  )

# Daily tsibble: one row per calendar date holding that date's cyclist total
# and the number of violation rows recorded on it.
# count() replaces group_by() + summarize(sum(n())) + ungroup(): same result,
# no "grouped output" message, and nothing left grouped.
ts_daily_total <- df %>%
  mutate(VIOLATION_DATE = as_date(VIOLATION_DATE)) %>%
  count(
    VIOLATION_DATE, daily_total_cyclists,
    name = "daily_total_violations"
  ) %>%
  as_tsibble(index = VIOLATION_DATE)

```
### Clean daily tsibble
```{r}
# check for missing dates  # ans: 24
ts_daily_total %>%
  #filter(is.na(daily_total_cyclists)) %>%
  count_gaps()

# are the days missing from df_counts?  # ans: no.
#df_counts %>%
#  filter(is.na(daily_total))

# NOTE(review): `df_counts` is not created anywhere in this notebook, so it
# has to exist in the session already. The join below needs it to provide at
# least the columns `date`, `daily_total`, `counts` and `weekly_total`.
if (!exists("df_counts")) {
  stop("`df_counts` must be loaded before this chunk is run.", call. = FALSE)
}

# Fill the missing dates, then take the cyclist totals from df_counts so the
# newly inserted dates get a cyclist count too. Dates that had no violation
# rows get 0 violations instead of NA.
# multiple = "any" keeps a single df_counts row per date if several match.
ts_daily_total <- ts_daily_total %>%
  fill_gaps() %>%
  select(-daily_total_cyclists) %>%
  left_join(df_counts, by = c("VIOLATION_DATE" = "date"), multiple = "any") %>%
  rename(daily_total_cyclists = daily_total) %>%
  select(-c(counts, weekly_total)) %>%
  mutate(daily_total_violations = replace_na(daily_total_violations, 0L))

# double check for NAs in the cyclist totals:
ts_daily_total %>%
  filter(is.na(daily_total_cyclists))

# double check for date gaps:
ts_daily_total %>%
  count_gaps()

```

### Create monthly data tsibble

```{r}
# Roll the cleaned daily tsibble up to calendar months: both daily totals are
# summed within each month (NAs ignored), and the monthly index is given the
# VIOLATION_DATE name again.
ts_monthly_total <- ts_daily_total %>%
  index_by(month = yearmonth(VIOLATION_DATE)) %>%
  summarize(
    monthly_total_violations = sum(daily_total_violations, na.rm = TRUE),
    monthly_total_cyclists = sum(daily_total_cyclists, na.rm = TRUE)
  ) %>%
  rename(VIOLATION_DATE = month)

```


# EDA

## EDA: General

### What's the date range for the data? 
```{r}
# Earliest and latest violation timestamps, shown as calendar dates.
df %>%
  pull(VIOLATION_DATE) %>%
  range() %>%
  date()
```
### Which dates had the highest number of riders, and how many?
```{r}
# The ten dates with the highest daily cyclist totals (one row per date).
df %>%
  mutate(VIOLATION_DATE = date(VIOLATION_DATE)) %>%
  distinct(VIOLATION_DATE, daily_total_cyclists) %>%
  arrange(desc(daily_total_cyclists)) %>%
  head(10)

# takeaway: no particular trend, but mostly late summer / early autumn.
# 4 of the dates are from September 2022.
```

### Which dates had the highest number of violations, and how many violations?
```{r}
# The ten dates with the most violation rows, largest first.
df %>%
  mutate(VIOLATION_DATE = date(VIOLATION_DATE)) %>%
  count(VIOLATION_DATE, name = "daily_total_violations", sort = TRUE) %>%
  head(10)

# takeaway: mostly warmer days in 2018 and 2019, pre-pandemic
```
## EDA: Violation Codes

### How many different violations were handed out to cyclists?
```{r}
# One row per distinct violation code; the row count is the answer.
distinct(df, VIOLATION_CODE)

# ans: 184
```
### How many total violations were handed out to cyclists?
```{r}
# Total number of violations = number of entries in VIOLATION_CODE.
length(df[["VIOLATION_CODE"]])

# There were zero NAs:
#sum(is.na(df$VIOLATION_CODE))

# ans: 122620
```


### What were the most popular violations?

```{r}
# Ten most frequent violation codes with their share of all violations.
# The count column is called n_violations rather than `sum`, so the percent
# expression no longer reads as sum / sum(sum). Rounding happens after scaling
# to percent to avoid artefacts such as 44.300000000000004.
df %>%
  count(VIOLATION_CODE, DESCRIPTION, name = "n_violations", sort = TRUE) %>%
  mutate(percent = round(100 * n_violations / sum(n_violations), 1)) %>%
  head(10)

# takeaways: 44% were for failing to stop at a red light.

# note: 12332: ATTACHING SELF TO MOVING MOTOR VEHICLE, a/k/a the Marty McFly rule

```






## EDA: daily cyclists

### Plot: daily total cyclists over time
```{r}
# Daily cyclist totals over time with a straight-line (lm) fit on top.
autoplot(ts_daily_total, daily_total_cyclists) +
  geom_smooth(method = "lm")

# takeaway: ridership did not change much pre or post pandemic, not even
# during lockdown, although there is a possible hiccup.
# However, ridership seems to be increasing in the colder months while
# remaining level in the warmer months.
```
### Plot: daily total cyclists histogram
```{r}
# Distribution of daily cyclist totals, 80 bins.
ggplot(ts_daily_total, aes(x = daily_total_cyclists)) +
  geom_histogram(bins = 80)
```




### Does ridership increase or decrease yearly?
```{r}
# Yearly totals, summed from the daily series. 2023 is dropped from the plot
# because the data only runs through March of that year.
ts_daily_total %>%
  index_by(yr = year(VIOLATION_DATE)) %>%
  summarize(
    yearly_total_violations = sum(daily_total_violations, na.rm = TRUE),
    yearly_total_cyclists = sum(daily_total_cyclists, na.rm = TRUE)
  ) %>%
  rename(VIOLATION_DATE = yr) %>%
  filter_index(. ~ "2022") %>%
  autoplot(yearly_total_cyclists)

# Takeaway: yes, it increases. Slight dip in 2019, but clearly increasing.

```


### Does ridership change throughout the year (seasonality)?
```{r}
# Seasonal plot of the daily series.
gg_season(ts_daily_total, daily_total_cyclists)
# takeaway - roughly the same cycle every year: ridership rises May-Oct and
# falls Nov-Apr.

# Same view on monthly totals, which is easier to read:
gg_season(ts_monthly_total, monthly_total_cyclists)
# takeaway - dip in April 2020 during the covid lockdown, but not much lower
# than before.

# One panel per calendar month:
gg_subseries(ts_monthly_total, monthly_total_cyclists)
# takeaways:
## seasonality and the month-to-month differences are a little clearer.
## 2023 is turning out to have noticeably more riders than previous years.


# Ridership changes over time - takeaways:
## Ridership is increasing.
## Clear yearly seasonality, with May-Oct the busiest and Nov-Apr less busy.
## Not much visual difference pre- and post- lockdown/pandemic.

```

### Daily total cyclists decomposition
```{r}

# Classical additive decomposition. It doesn't handle multiple seasonal
# periods too well.
ts_daily_total %>%
#  filter(!is.na(daily_total_cyclists)) %>%
#  fill_gaps() %>%
  model(classical_decomposition(daily_total_cyclists, type = "additive")) %>%
  components() %>%
  autoplot()


# STL with a 365-observation trend window plus yearly and monthly seasonal
# terms - best so far. (The stray trailing comma inside STL() was removed.)
ts_daily_total %>%
  model(
    STL(
      daily_total_cyclists ~ trend(window = 365) +
        season(period = "year") +
        season(period = "1 month")
    )
  ) %>%
  components() %>%
  autoplot()

# What about a moving average for the trend? It's ok; I think STL is best.
# 90 days before + the day itself + 90 days after = a centred 181-day window,
# so the helper column is named `181-MA` (it used to be mislabelled `5-MA`).
# .complete = TRUE leaves NA wherever the full window is not available.
ts_daily_total %>%
  mutate(
    `181-MA` = slider::slide_dbl(
      daily_total_cyclists, mean,
      .before = 90, .after = 90, .complete = TRUE
    )
  ) %>%
  autoplot(daily_total_cyclists) +
  geom_line(aes(y = `181-MA`), colour = "#D55E00")

# What about just a regular regression line?
ts_daily_total %>%
  autoplot(daily_total_cyclists) +
  stat_smooth(method = "lm")


```

## EDA: daily violations

### Plot: daily total violations over time
```{r}

# Daily violation counts over the whole period.
autoplot(ts_daily_total, daily_total_violations)

# takeaway: there were more violations before the pandemic. During lockdown,
# very few.

```

### Plot: daily total violations histogram
```{r}
# Distribution of daily violation counts, 80 bins.
ggplot(ts_daily_total, aes(x = daily_total_violations)) +
  geom_histogram(bins = 80)
```
### Are the yearly number of violations trending up or down?
```{r}
# Yearly violation totals, summed from the daily series and plotted through
# 2022.
ts_daily_total %>%
  #filter_index('2020'~.) %>%
  index_by(yr = year(VIOLATION_DATE)) %>%
  summarize(
    yearly_total_violations = sum(daily_total_violations, na.rm = TRUE),
    yearly_total_cyclists = sum(daily_total_cyclists, na.rm = TRUE)
  ) %>%
  rename(VIOLATION_DATE = yr) %>%
  filter_index(. ~ "2022") %>%
  autoplot(yearly_total_violations)

# takeaway: not really, post-covid. It looks steady. 2020, even with lockdown,
# had more violations than 2021 and 2022, but fewer than 2019.

# Monthly totals: just a cleaner version of the daily violations plot.
# Post-covid looks steady.
autoplot(ts_monthly_total, monthly_total_violations)

```



### Do number of violations change throughout the year (seasonal)?
```{r}
# Seasonal plot of daily violations, restricted to March 2020 onward.
ts_daily_total %>%
  filter_index("2020-03" ~ .) %>%
  gg_season(daily_total_violations)


# Same window on the monthly totals - is it easier to see?
ts_monthly_total %>%
  filter_index("2020-03" ~ .) %>%
  gg_season(monthly_total_violations)

# Subseries plot: one panel per calendar month.
ts_monthly_total %>%
  filter_index("2020-03" ~ .) %>%
  gg_subseries(monthly_total_violations)

# takeaway - hard to see any trends, but fewer violations in Nov/Dec/Jan and
# the most in Sept/Oct. 2021 was a steady year.
```




### As the number of riders increases, does the number of violations also increase?
```{r}

# Pearson correlation between daily cyclists and daily violations
# (uses the corrr library).
#
# The data is first reduced to ONE ROW PER DAY. The earlier version used
# mutate() on the violation-level rows, so each day appeared once per ticket
# and busy-ticket days were weighted far more heavily than quiet ones.
df %>%
  mutate(VIOLATION_DATE = date(VIOLATION_DATE)) %>%
  count(
    VIOLATION_DATE, daily_total_cyclists,
    name = "daily_total_violations"
  ) %>%
  select(daily_total_cyclists, daily_total_violations) %>%
  correlate()

# NOTE(review): the previously recorded value (cor: 0.2159162, read as "no, as
# the number of riders increases, the number of violations does not") came
# from the per-violation rows. Re-run and re-check the takeaway.

```
### Plot: daily violations vs. daily cyclists
```{r}
# Scatter of daily violations (x) against daily cyclists (y), one point per day.
ggplot(
  ts_daily_total,
  aes(x = daily_total_violations, y = daily_total_cyclists)
) +
  geom_point()

```

### Does this differ for pre-covid and post-lockdown?
#### Plot: Pre-covid
```{r}

# Pre-covid window: everything up to and including March 2020.
# filter_index(~ "2020-04") ran through the END of April 2020, so April was
# counted in both this window and the post-covid one ("2020-04" ~ .).
#
# NOTE(review): the recorded correlation of 0.501744 was computed with April
# 2020 included; re-run to refresh it.
# The value is stored as cor_pre_covid (previously `cor`, which masked
# stats::cor and was never used) and printed.
cor_pre_covid <- ts_daily_total %>%
  filter_index(~ "2020-03") %>%
  select(daily_total_cyclists, daily_total_violations) %>%
  correlate() %>%
  slice(1) %>%
  pull(daily_total_violations)
cor_pre_covid


# pre-covid scatter with linear fit and correlation label
ts_daily_total %>%
  filter_index(~ "2020-03") %>%
  ggplot(aes(x = daily_total_violations, y = daily_total_cyclists)) +
  geom_point() +
  geom_smooth(method = lm) +
  sm_statCorr(label_x = 250, label_y = 1000)
  
```

#### Plot: Post-covid
```{r}
#devtools::install_github('smin95/smplot2', force = TRUE)
#library(smplot2)

# Post-covid window: April 2020 onward. Scatter of daily violations against
# daily cyclists with a linear fit and the correlation printed on the plot.
ts_daily_total %>%
  filter_index("2020-04" ~ .) %>%
  ggplot(aes(x = daily_total_violations, y = daily_total_cyclists)) +
  geom_point() +
  geom_smooth(method = "lm") +
  sm_statCorr(label_x = 120, label_y = 1000)

# Correlation table for the same window.
# correlation:  0.3846224
ts_daily_total %>%
  filter_index("2020-04" ~ .) %>%
  select(daily_total_cyclists, daily_total_violations) %>%
  correlate()

# ans: the two periods differ a little from the all-data figure; pre-covid
# ridership was the better indicator.
```

### Do largest violations (summed per day) change over time?
```{r}
# Code test: keep the 9 most frequent violation codes and lump everything
# else into "Other" (10 levels in total), then list the level counts.
# fct_lump_n() is the current replacement for fct_lump(n = ...), and the
# column is referenced directly rather than through df$.
df %>%
  mutate(code_lumped = fct_lump_n(VIOLATION_CODE, n = 9)) %>%
  count(code_lumped, sort = TRUE)

# plot: daily count of the biggest violation (1111D1C) over time, post covid
df %>%
  mutate(
    code_lumped = fct_lump_n(VIOLATION_CODE, n = 9),
    date = date(VIOLATION_DATE)
  ) %>%
  filter(code_lumped == "1111D1C", date >= as.Date("2020-04-01")) %>%
  count(date, name = "n_violations") %>%
  ggplot(aes(x = date, y = n_violations)) +
  geom_line()


# takeaway: no, not much different from the general daily violations plot.
# Lower amount around Dec/Jan, otherwise fairly steady.

```


### To Do: Did the type of violations change after covid?

```{r}

```


### To do: Geocode violations.  See where most of them occur. 
```{r}


# Violations from April 2020 onward. `fct_lump` keeps the 9 most frequent
# violation codes of the WHOLE data set (lumping happens before the date
# filter) and collapses the rest into "Other".
df_fct_lump_post_covid <- df %>%
  mutate(
    fct_lump = fct_lump_n(VIOLATION_CODE, n = 9),
    date = date(VIOLATION_DATE)
  ) %>%
  filter(date >= as.Date("2020-04-01"))

# NOTE(review): crs = 4269 (NAD83) is kept from the original code throughout;
# confirm it matches how Latitude/Longitude were recorded.

# All post-covid violations, uncoloured.
# Fix: the filtered data is now what gets NA-cleaned and mapped. Previously
# the piped data went in as-is and na.omit(df) was passed as a stray second
# argument.
df %>%
  #mutate(fct_lump = fct_lump(f=df$VIOLATION_CODE, n=9)) %>%
  mutate(date = date(VIOLATION_DATE)) %>%
  select(-VIOLATION_DATE) %>%
  filter(date >= as.Date("2020-04-01")) %>%
  na.omit() %>%
  mapview(
    xcol = "Longitude", ycol = "Latitude", crs = 4269,
    grid = FALSE, alpha.regions = 0.2
  )

# Same period, coloured by lumped violation code.
mapview(
  na.omit(df_fct_lump_post_covid),
  xcol = "Longitude", ycol = "Latitude", zcol = "fct_lump", crs = 4269,
  grid = FALSE, alpha.regions = 0.1
)

# map is too full, let's just view 2022-04 through 2023-03

# Narrowing the post-covid frame gives the same rows as rebuilding from df:
# the lumping was already done on the full data.
df_fct_lump_prev_year <- df_fct_lump_post_covid %>%
  filter(date >= as.Date("2022-04-01"))

df_fct_lump_prev_year %>%
  count(fct_lump, sort = TRUE)

mapview(
  na.omit(df_fct_lump_prev_year),
  xcol = "Longitude", ycol = "Latitude", zcol = "fct_lump", crs = 4269,
  grid = FALSE, alpha.regions = 0.8
)
# takeaway: better, but still not as clear as I'd like.


# Let's just view fct_lump == 1111D1C for the previous year.
# (A second pipeline that rebuilt exactly this data from df and drew the same
# map again has been removed.)
df_fct_lump_prev_year %>%
  filter(fct_lump == "1111D1C") %>%
  mapview(
    xcol = "Longitude", ycol = "Latitude", crs = 4269,
    grid = FALSE, alpha.regions = 0.2
  )

# takeaway: we have a better view of the major ticketing areas for 1111D1C:
# UWS, UES (and their corresponding avenues up through Harlem), up and down
# 1st and 2nd Ave, especially in the East Village, over the Williamsburg
# Bridge, up and down 4th and 5th Aves in Brooklyn, Liberty Ave in Ozone Park /
# Richmond Hill, the east side of Prospect Park on Bedford Ave, and
# Bensonhurst, Brooklyn.
```



### To do: Can we predict ridership?
```{r}

```


### To do: Can we predict daily violations?
```{r}

```


#### scratchpad
```{r}
str(df)

# Plot shows that e-bikes and e-scooters didn't have their own categories
# until sometime in mid 2022. Before that, everything was under 'bikes'.
df %>%
  ggplot(aes(x = VIOLATION_DATE, y = CITY_NM, col = VEH_CATEGORY)) +
  geom_jitter()

# Same view coloured by violation code - nothing to see.
df %>%
  ggplot(aes(x = VIOLATION_DATE, y = CITY_NM, col = VIOLATION_CODE)) +
  geom_jitter()

# violations per month
df %>%
  count(yearmonth = yearmonth(VIOLATION_DATE), name = "total_violations") %>%
  ggplot(aes(x = yearmonth, y = total_violations)) +
  geom_line()


# violations per borough per year
# Fix: this used to group by yearmonth() despite the `year` name, so it was
# really plotting months. Note that 2023 only runs through March, so its
# total is partial.
df %>%
  count(year = year(VIOLATION_DATE), CITY_NM, name = "total_violations") %>%
  ggplot(aes(x = year, y = total_violations, col = CITY_NM)) +
  geom_line()


# cyclist counters per borough:
#library(forcats)
# NOTE(review): `df_bicycle_counters_boroughs` is not created in this notebook,
# and the last run failed with "object 'Borough' not found". The plot is
# skipped unless the object exists and has a `Borough` column.
if (exists("df_bicycle_counters_boroughs") &&
      "Borough" %in% names(df_bicycle_counters_boroughs)) {
  df_bicycle_counters_boroughs %>%
    ggplot(aes(x = fct_infreq(Borough))) +
    geom_bar() +
    labs(x = "Borough", y = "Cyclist Counters")
} else {
  message(
    "Skipping counters-per-borough plot: `df_bicycle_counters_boroughs` ",
    "with a `Borough` column is not available."
  )
}
```

